{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c8bc0a80",
   "metadata": {},
   "source": [
    "# Example: Optimizing LLMs to Satisfy a Judge with Hidden Preferences\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87452cc5",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0e78a32-9c0d-40e6-84fd-a7f497f98092",
   "metadata": {},
   "outputs": [],
   "source": [
    "import asyncio\n",
    "import random\n",
    "\n",
    "import altair as alt\n",
    "import pandas as pd\n",
    "from tensorzero import AsyncTensorZeroGateway\n",
    "from tqdm.asyncio import tqdm_asyncio"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f12f651d",
   "metadata": {},
   "source": [
    "> **IMPORTANT:** Update the gateway URL below if you're not using the standard setup provided in this example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c815e24-0396-4942-a729-1b0b88ae0428",
   "metadata": {},
   "outputs": [],
   "source": [
    "TENSORZERO_GATEWAY_URL = \"http://localhost:3000\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4d814826",
   "metadata": {},
   "source": [
    "## Load the Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39f564ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "NUM_TRAIN_DATAPOINTS = 500\n",
    "NUM_VAL_DATAPOINTS = 500"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ef80de5",
   "metadata": {},
   "outputs": [],
   "source": [
    "random.seed(0)  # Set seed for reproducibility\n",
    "\n",
    "\n",
    "with open(\"data/nounlist.txt\", \"r\") as file:\n",
    "    topics = [line.strip() for line in file]\n",
    "    random.shuffle(topics)\n",
    "\n",
    "print(f\"There are {len(topics)} topics in the list of haiku topics.\")\n",
    "\n",
    "train_topics = topics[:NUM_TRAIN_DATAPOINTS]\n",
    "val_topics = topics[NUM_TRAIN_DATAPOINTS : NUM_TRAIN_DATAPOINTS + NUM_VAL_DATAPOINTS]\n",
    "\n",
    "print(f\"Using {len(train_topics)} topics for training and {len(val_topics)} topics for validation.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "420e6e58",
   "metadata": {},
   "source": [
    "## Inference: Write and Judge Haikus"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "77c8c645",
   "metadata": {},
   "source": [
    "> **IMPORTANT:** Reduce the number of concurrent requests if you're running into rate limits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "613216e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_CONCURRENT_REQUESTS = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6dad2676",
   "metadata": {},
   "outputs": [],
   "source": [
    "tensorzero_client = await AsyncTensorZeroGateway.build_http(gateway_url=TENSORZERO_GATEWAY_URL, timeout=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d599b0d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "async def write_judge_haiku(topic, variant_name):\n",
    "    # Generate a haiku about the given topic\n",
    "    try:\n",
    "        write_result = await tensorzero_client.inference(\n",
    "            function_name=\"write_haiku\",\n",
    "            variant_name=variant_name,  # only used during validation\n",
    "            input={\n",
    "                \"messages\": [\n",
    "                    {\n",
    "                        \"role\": \"user\",\n",
    "                        \"content\": [{\"type\": \"text\", \"value\": {\"topic\": topic}}],\n",
    "                    }\n",
    "                ]\n",
    "            },\n",
    "        )\n",
    "    except Exception as e:\n",
    "        print(f\"Error occurred: {type(e).__name__}: {e}\")\n",
    "        return None\n",
    "\n",
    "    # The LLM is instructed to conclude with the haiku, so we extract the last 3 lines\n",
    "    # In a real application, you'll want more sophisticated validation and parsing logic\n",
    "    haiku_text = \"\\n\".join(write_result.content[0].text.strip().split(\"\\n\")[-3:])\n",
    "\n",
    "    # Judge the haiku using a separate TensorZero function\n",
    "    # We use the same episode_id to associate these inferences\n",
    "    try:\n",
    "        judge_result = await tensorzero_client.inference(\n",
    "            function_name=\"judge_haiku\",\n",
    "            input={\n",
    "                \"messages\": [\n",
    "                    {\n",
    "                        \"role\": \"user\",\n",
    "                        \"content\": [\n",
    "                            {\n",
    "                                \"type\": \"text\",\n",
    "                                \"value\": {\"topic\": topic, \"haiku\": haiku_text},\n",
    "                            }\n",
    "                        ],\n",
    "                    }\n",
    "                ]\n",
    "            },\n",
    "            episode_id=write_result.episode_id,\n",
    "        )\n",
    "\n",
    "        score = judge_result.output.parsed[\"score\"]\n",
    "    except Exception as e:\n",
    "        print(f\"Error occurred: {type(e).__name__}: {e}\")\n",
    "        return None\n",
    "\n",
    "    return (write_result.inference_id, score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0201056e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run inference in parallel to speed things up\n",
    "semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)\n",
    "\n",
    "\n",
    "async def ratelimited_write_judge_haiku(topic, variant_name=None):\n",
    "    async with semaphore:\n",
    "        return await write_judge_haiku(topic, variant_name=variant_name)\n",
    "\n",
    "\n",
    "results = await tqdm_asyncio.gather(*[ratelimited_write_judge_haiku(topic) for topic in train_topics])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a4d888f1",
   "metadata": {},
   "source": [
    "## Send Feedback"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdb56a67",
   "metadata": {},
   "outputs": [],
   "source": [
    "async def send_haiku_feedback(inference_id, score):\n",
    "    async with semaphore:\n",
    "        await tensorzero_client.feedback(metric_name=\"haiku_score\", inference_id=inference_id, value=score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3561fa6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "await tqdm_asyncio.gather(*[send_haiku_feedback(*result) for result in results if result is not None]);"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b6ca41f3",
   "metadata": {},
   "source": [
    "## Validation Set"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "95b79a16",
   "metadata": {},
   "source": [
    "> **IMPORTANT:** Update the list below when you create new variants in `tensorzero.toml`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b05ee254",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Include the variants in `tensorzero.toml` that we want to evaluate\n",
    "VARIANTS_TO_EVALUATE = [\n",
    "    \"gpt_4o_mini\",\n",
    "    # \"gpt_4o_mini_fine_tuned\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44d9244a",
   "metadata": {},
   "outputs": [],
   "source": [
    "scores = {}  # variant_name => score\n",
    "\n",
    "\n",
    "for variant_name in VARIANTS_TO_EVALUATE:\n",
    "    # Run inference on the validation set\n",
    "    val_results = await tqdm_asyncio.gather(\n",
    "        *[\n",
    "            ratelimited_write_judge_haiku(\n",
    "                topic,\n",
    "                variant_name=variant_name,  # pin to the specific variant we want to evaluate\n",
    "            )\n",
    "            for topic in val_topics\n",
    "        ],\n",
    "        desc=f\"Evaluating variant: {variant_name}\",\n",
    "    )\n",
    "\n",
    "    # Compute the average score for the variant\n",
    "    scores[variant_name] = sum(result[1] for result in val_results if result is not None) / len(val_results)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ebcaa08",
   "metadata": {},
   "source": [
    "## Plot Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fe172c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a dataframe for plotting\n",
    "scores_df = []\n",
    "\n",
    "for variant_name, variant_score in scores.items():\n",
    "    scores_df.append(\n",
    "        {\n",
    "            \"Variant\": variant_name,\n",
    "            \"Metric\": \"haiku_score\",\n",
    "            \"Score\": variant_score,\n",
    "        }\n",
    "    )\n",
    "\n",
    "scores_df = pd.DataFrame(scores_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05f7c7ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the chart\n",
    "chart = (\n",
    "    alt.Chart(scores_df)\n",
    "    .encode(\n",
    "        x=alt.X(\"Score:Q\", axis=alt.Axis(format=\"%\"), scale=alt.Scale(domain=[0, 1])),\n",
    "        y=\"Variant:N\",\n",
    "        color=\"Metric:N\",\n",
    "        text=alt.Text(\"Score:Q\", format=\".1%\"),\n",
    "    )\n",
    "    .properties(title=\"Score by Variant\")\n",
    ")\n",
    "\n",
    "chart = chart.mark_bar() + chart.mark_text(align=\"left\", dx=2)\n",
    "\n",
    "chart"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
